package org.apache.solr.update.processor;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.solr.common.params.SolrParams;

/**
 * <p>
 * This implementation is copied from Apache Nutch.
 * </p>
 * <p>
 * An implementation of a page signature. It calculates an MD5 hash of a plain
 * text "profile" of a page.
 * </p>
 * <p>
 * The algorithm to calculate a page "profile" takes the plain text version of a
 * page and performs the following steps:
 * <ul>
 * <li>remove all characters except letters and digits, and bring all characters
 * to lower case,</li>
 * <li>split the text into tokens (all consecutive non-whitespace characters),</li>
 * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
 * characters),</li>
 * <li>sort the list of tokens by decreasing frequency,</li>
 * <li>round down the counts of tokens to the nearest multiple of QUANT (
 * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
 * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
 * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
 * (which means that tokens with frequency 1 are always discarded).</li>
 * <li>tokens, which frequency after quantization falls below QUANT, are
 * discarded.</li>
 * <li>create a list of tokens and their quantized frequency, separated by
 * spaces, in the order of decreasing frequency.</li>
 * </ul>
 * This list is then submitted to an MD5 hash calculation.
 */
public class TextProfileSignature extends MD5Signature {

	private float quantRate;
	private float minTokenLen;

	@Override
	public void init(SolrParams params) {
		quantRate = params.getFloat("quantRate", 0.01f);
		minTokenLen = params.getInt("minTokenLen", 2);
	}

	@Override
	public byte[] getSignature() {
		return super.getSignature();
	}

	@Override
	public void add(String content) {
		HashMap<String, Token> tokens = new HashMap<String, Token>();

		StringBuilder curToken = new StringBuilder();
		int maxFreq = 0;
		for (int i = 0; i < content.length(); i++) {
			char c = content.charAt(i);
			if (Character.isLetterOrDigit(c)) {
				curToken.append(Character.toLowerCase(c));
			} else {
				if (curToken.length() > 0) {
					if (curToken.length() > minTokenLen) {
						// add it
						String s = curToken.toString();
						Token tok = tokens.get(s);
						if (tok == null) {
							tok = new Token(0, s);
							tokens.put(s, tok);
						}
						tok.cnt++;
						if (tok.cnt > maxFreq)
							maxFreq = tok.cnt;
					}
					curToken.setLength(0);
				}
			}
		}
		// check the last token
		if (curToken.length() > minTokenLen) {
			// add it
			String s = curToken.toString();
			Token tok = tokens.get(s);
			if (tok == null) {
				tok = new Token(0, s);
				tokens.put(s, tok);
			}
			tok.cnt++;
			if (tok.cnt > maxFreq)
				maxFreq = tok.cnt;
		}
		Iterator<Token> it = tokens.values().iterator();
		ArrayList<Token> profile = new ArrayList<Token>();
		// calculate the QUANT value
		int quant = Math.round(maxFreq * quantRate);
		if (quant < 2) {
			if (maxFreq > 1)
				quant = 2;
			else
				quant = 1;
		}
		while (it.hasNext()) {
			Token t = it.next();
			// round down to the nearest QUANT
			t.cnt = (t.cnt / quant) * quant;
			// discard the frequencies below the QUANT
			if (t.cnt < quant) {
				continue;
			}
			profile.add(t);
		}
		Collections.sort(profile, new TokenComparator());
		StringBuilder newText = new StringBuilder();
		it = profile.iterator();
		while (it.hasNext()) {
			Token t = it.next();
			if (newText.length() > 0)
				newText.append("\n");
			newText.append(t.toString());
		}

		super.add(newText.toString());
	}

	private static class Token {
		public int cnt;
		public String val;

		public Token(int cnt, String val) {
			this.cnt = cnt;
			this.val = val;
		}

		@Override
		public String toString() {
			return val + " " + cnt;
		}
	}

	private static class TokenComparator implements Comparator<Token> {
		public int compare(Token t1, Token t2) {
			return t2.cnt - t1.cnt;
		}
	}

}
